In [154]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches #for custom legends
import seaborn as sns
from lifelines import KaplanMeierFitter #survival analysis library
from lifelines.statistics import logrank_test #survival statistical testing
from IPython.display import Image
from IPython.core.display import HTML 
%matplotlib inline



In [155]:

    
def display_all(df):
    with pd.option_context("display.max_rows", 1000): 
        with pd.option_context("display.max_columns", 1000): 
            display(df)



In [3]:

    
url="https://community.watsonanalytics.com/wp-content/uploads/2015/03/WA_Fn-UseC_-Telco-Customer-Churn.csv?cm_mc_uid=51304980933215218170416&cm_mc_sid_50200000=92178841521817041648&cm_mc_sid_52640000=98592221521817041652"
df = pd.read_csv(url)
print(df.shape)
display_all(df.tail().transpose())









    



(7043, 21)






    







  
    
      
      7038
      7039
      7040
      7041
      7042
    
  
  
    
      customerID
      6840-RESVB
      2234-XADUH
      4801-JZAZL
      8361-LTMKD
      3186-AJIEK
    
    
      gender
      Male
      Female
      Female
      Male
      Male
    
    
      SeniorCitizen
      0
      0
      0
      1
      0
    
    
      Partner
      Yes
      Yes
      Yes
      Yes
      No
    
    
      Dependents
      Yes
      Yes
      Yes
      No
      No
    
    
      tenure
      24
      72
      11
      4
      66
    
    
      PhoneService
      Yes
      Yes
      No
      Yes
      Yes
    
    
      MultipleLines
      Yes
      Yes
      No phone service
      Yes
      No
    
    
      InternetService
      DSL
      Fiber optic
      DSL
      Fiber optic
      Fiber optic
    
    
      OnlineSecurity
      Yes
      No
      Yes
      No
      Yes
    
    
      OnlineBackup
      No
      Yes
      No
      No
      No
    
    
      DeviceProtection
      Yes
      Yes
      No
      No
      Yes
    
    
      TechSupport
      Yes
      No
      No
      No
      Yes
    
    
      StreamingTV
      Yes
      Yes
      No
      No
      Yes
    
    
      StreamingMovies
      Yes
      Yes
      No
      No
      Yes
    
    
      Contract
      One year
      One year
      Month-to-month
      Month-to-month
      Two year
    
    
      PaperlessBilling
      Yes
      Yes
      Yes
      Yes
      Yes
    
    
      PaymentMethod
      Mailed check
      Credit card (automatic)
      Electronic check
      Mailed check
      Bank transfer (automatic)
    
    
      MonthlyCharges
      84.8
      103.2
      29.6
      74.4
      105.65
    
    
      TotalCharges
      1990.5
      7362.9
      346.45
      306.6
      6844.5
    
    
      Churn
      No
      No
      No
      Yes
      No



In [4]:

    
df["b_Churn"] = df.Churn.apply(lambda x: 1 if x == "Yes" else 0) #recode churn var



In [5]:

    
df.MultipleLines.value_counts()









    Out[5]:





No                  3390
Yes                 2971
No phone service     682
Name: MultipleLines, dtype: int64



In [6]:

    
#drop "No phone service"
df[df.MultipleLines != "No phone service"]

#recode MultipleLines var to get our two comparison cohorts
df["b_MultipleLines"] = df.MultipleLines.apply(lambda x: 1 if x == "Yes" else 0)



In [42]:

    
#this is not time series data, so we will assume all customers start at t = 0 
# and continue until t = tenure
df.tenure.describe()









    Out[42]:





count    7043.000000
mean       32.371149
std        24.559481
min         0.000000
25%         9.000000
50%        29.000000
75%        55.000000
max        72.000000
Name: tenure, dtype: float64

Kaplan-Meier

For background, see Rich et al (2010), Singh et all (2011)

Estimates the “survival function” via the for a cohort of subjects
Does not provide an estimate of the magnitude of the difference in survival for the cohorts being compared
Each observation (case, customer) should have one "birth" (activation) and one "death" (churn)



In [35]:

    
kmf = KaplanMeierFitter()
T = df['tenure'] #duration
C = df["b_Churn"] #censorship - 1 if death/churn is seen, 0 if censored



In [157]:

    
palette = ["windows blue", "amber"]
sns.set_palette(sns.xkcd_palette(palette))

##SET UP PLOT
ax = plt.subplot(111)
plt.title('Kaplan-Meier Estimate of Driver Retention by Multiple Lines')
sns.set_context("talk")

d={} #to store the models
vlines = []
i=0

##PLOT FITTED GRAPH
#loop through segmentation variable, plot on same axes
for segment in df.b_MultipleLines.unique(): 
    ix = df.b_MultipleLines == segment
    d['kmf{}'.format(i+1)] = kmf.fit(T.loc[ix],C.loc[ix], label=segment)
    ax = kmf.plot(ax=ax, figsize=(12,6))

    ax.set_xlim([T.min(),T.max()])
    ax.set_ylim([0.5,1])
    
    y_ = kmf.survival_function_[kmf.survival_function_.round(2) ==.75].dropna().index[0]
    ymax_ = kmf.survival_function_[kmf.survival_function_.round(2) ==.75].dropna()[i][y_]
    
    vlines.append([y_, ymax_])
    i+=1

##PLOT ANNOTATION
#for each intercept, plot a horizontal and a vertical line up to the fitted curve
xmin_=0
for i, xy in enumerate(vlines):
    xmax_ = xy[0]
    color="xkcd:{}".format(palette[i])
    
    plt.axvline(x=xy[0], ymax=.5, alpha=.8, color=color, linestyle='dotted') #axes fractional
    plt.hlines(y=.75, xmin=xmin_, xmax=xmax_, alpha=.8, color=color, linestyle='dotted') #axes data coordinates
    xmin_ += xmax_ #to avoid overlapping hlines

## position text label
difference = vlines[1][0]-vlines[0][0]
label_frac_x = (vlines[0][0]+(difference)/2)/T.max()-.07 #midpoint (minus .07 centering factor)
label_frac_y = 0.2

#label first line
ax.annotate('Difference:\n{} Months'.format(difference), 
  xy=(vlines[0][0],.62), xycoords='data', color='white',
  xytext=(label_frac_x,label_frac_y), textcoords='axes fraction',
  arrowprops=dict(arrowstyle="-|>", #arrows removed for reability
                  fc="k", ec="k")
           )

#label second line    
ax.annotate('Difference:\n{} Months'.format(difference), 
      xy=(vlines[1][0],.62), xycoords='data', color='black',
      xytext=(label_frac_x, label_frac_y), textcoords='axes fraction',
      arrowprops=dict(arrowstyle="-|>", #arrows removed for reability
                      fc="k", ec='k')
           )

##LEGEND
#override default legend
patches = [ mpatches.Patch(color="xkcd:windows blue", label='Has One Phoneline'),
            mpatches.Patch(color="xkcd:amber", label='Has Multiple Phonelines')
          ]
plt.legend(handles=[patches[0],patches[1]], title="User Segmentation", loc='best');

The graph clearly shows that there is a difference in tenure between "single line" and "multiple line" telco users. Since the confidence interals also appear on the graph, we can see that the difference is statistically significant from t > 0 until approximately t= 52 months, after which the churn rates becomes the same, within the margin of error.

Our survival curves do not follow the constant proportionality assumption (they cannot because they cross, and we can also see varying distances between the curves throughout). Conventionally, ths would mean that a logrank test is not an appropriate measure for estimating the hazard ratio. This presentation by Stone (2015) makes a compelling argument that hazard ratios on nonproportionality are still readily interpretable. This has, however, been a source of controversy in the biostatistics field. Alternative methods such as time-splitting have been proposed, see: Bouliotis & Billingham (2011)

To get the hazard ratio, we run the logrank test.



In [138]:

    
multi = (df["b_MultipleLines"] == 1)
results = logrank_test(T[multi], T[~multi], C[multi], C[~multi], alpha=0.99 ) #at 99% confidence level
results.print_summary()









    



df=1, alpha=0.99, null_distribution=chi squared, t_0=-1

test_statistic      p     
       24.8504 0.0000  ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1



In [153]:

    
# Image Credit: https://en.wikipedia.org/wiki/Log-rank_test
Image(url= "https://wikimedia.org/api/rest_v1/media/math/render/svg/68750c8d50ac1d055b3ddcc67a05de43b40fdebf")









    Out[153]:



In [152]:

    
Z = results.test_statistic
D = C.sum() #number of events observed

hazard_ratio = np.exp(Z*np.sqrt(4/D))
print(hazard_ratio)









    



3.15703557409

Therefore the risk of churn is 3.2 times higher in users with only one phoneline compared to users with multiple lines.



In [ ]:

	7038	7039	7040	7041	7042
customerID	6840-RESVB	2234-XADUH	4801-JZAZL	8361-LTMKD	3186-AJIEK
gender	Male	Female	Female	Male	Male
SeniorCitizen	0	0	0	1	0
Partner	Yes	Yes	Yes	Yes	No
Dependents	Yes	Yes	Yes	No	No
tenure	24	72	11	4	66
PhoneService	Yes	Yes	No	Yes	Yes
MultipleLines	Yes	Yes	No phone service	Yes	No
InternetService	DSL	Fiber optic	DSL	Fiber optic	Fiber optic
OnlineSecurity	Yes	No	Yes	No	Yes
OnlineBackup	No	Yes	No	No	No
DeviceProtection	Yes	Yes	No	No	Yes
TechSupport	Yes	No	No	No	Yes
StreamingTV	Yes	Yes	No	No	Yes
StreamingMovies	Yes	Yes	No	No	Yes
Contract	One year	One year	Month-to-month	Month-to-month	Two year
PaperlessBilling	Yes	Yes	Yes	Yes	Yes
PaymentMethod	Mailed check	Credit card (automatic)	Electronic check	Mailed check	Bank transfer (automatic)
MonthlyCharges	84.8	103.2	29.6	74.4	105.65
TotalCharges	1990.5	7362.9	346.45	306.6	6844.5
Churn	No	No	No	Yes	No